library(leaps)
library(MASS)
library(ggplot2)
require(caTools)
## Loading required package: caTools
data = read.csv('covid-project-raw-data/full_data_no_na.csv')
summary(data)
## X.1 X fips county
## Min. : 1.0 Min. : 3.0 Min. : 1001 Length:2944
## 1st Qu.: 761.8 1st Qu.: 842.8 1st Qu.:18182 Class :character
## Median :1506.5 Median :1598.5 Median :29100 Mode :character
## Mean :1539.2 Mean :1633.3 Mean :30355
## 3rd Qu.:2330.2 3rd Qu.:2442.5 3rd Qu.:45319
## Max. :3109.0 Max. :3234.0 Max. :56045
## state cases deaths mask_score
## Length:2944 Min. : 11.0 Min. : 0.00 Min. :1.433
## Class :character 1st Qu.: 469.5 1st Qu.: 6.00 1st Qu.:2.701
## Mode :character Median : 1101.5 Median : 17.00 Median :3.000
## Mean : 4073.6 Mean : 78.48 Mean :2.995
## 3rd Qu.: 2758.5 3rd Qu.: 48.00 3rd Qu.:3.296
## Max. :370729.0 Max. :7446.00 Max. :3.849
## ratio_pop_to_physician unemploy_rate HS_grad_rate flu_vacc_.
## Min. :-3936 Min. :0.01624 Min. :0.3630 Min. :0.0700
## 1st Qu.: 1048 1st Qu.:0.03567 1st Qu.:0.8467 1st Qu.:0.3500
## Median : 1498 Median :0.04388 Median :0.8951 Median :0.4200
## Mean : 1916 Mean :0.04619 Mean :0.8838 Mean :0.4087
## 3rd Qu.: 2202 3rd Qu.:0.05365 3rd Qu.:0.9344 3rd Qu.:0.4800
## Max. :51307 Max. :0.19066 Max. :1.0000 Max. :0.6500
## uninsured median_income pop_size X._rural
## Min. :0.02068 Min. : 24783 Min. : 1718 Min. :0.0000
## 1st Qu.:0.07036 1st Qu.: 42208 1st Qu.: 13074 1st Qu.:0.3222
## Median :0.10272 Median : 48728 Median : 28270 Median :0.5739
## Mean :0.10937 Mean : 50996 Mean : 107303 Mean :0.5678
## 3rd Qu.:0.13841 3rd Qu.: 56546 3rd Qu.: 73641 3rd Qu.:0.8149
## Max. :0.30877 Max. :136191 Max. :10163507 Max. :1.0000
## life_expectancy non_hispanic_african_american american_indian
## Min. :66.50 Min. :0.0005099 Min. :0.000496
## 1st Qu.:75.49 1st Qu.:0.0073038 1st Qu.:0.003656
## Median :77.48 Median :0.0244137 Median :0.005952
## Mean :77.40 Mean :0.0925949 Mean :0.019425
## 3rd Qu.:79.22 3rd Qu.:0.1061252 3rd Qu.:0.012304
## Max. :97.97 Max. :0.8532961 Max. :0.879781
## asian native_hawaiian hispanic non_hispanic_white
## Min. :0.000000 Min. :0.0000000 Min. :0.005151 Min. :0.0276
## 1st Qu.:0.004604 1st Qu.:0.0003171 1st Qu.:0.022798 1st Qu.:0.6470
## Median :0.007189 Median :0.0005949 Median :0.042757 Median :0.8371
## Mean :0.015009 Mean :0.0012039 Mean :0.094767 Mean :0.7647
## 3rd Qu.:0.014113 3rd Qu.:0.0011243 3rd Qu.:0.096909 3rd Qu.:0.9250
## Max. :0.430067 Max. :0.1312001 Max. :0.963230 Max. :0.9792
## sim_diversity_index X._democrat area pop_density
## Min. :0.04102 Min. :0.06955 Min. : 2.0 Min. : 0.50
## 1st Qu.:0.14154 1st Qu.:0.21860 1st Qu.: 426.7 1st Qu.: 20.98
## Median :0.28553 Median :0.30358 Median : 599.9 Median : 48.40
## Mean :0.31371 Mean :0.33789 Mean : 946.6 Mean : 214.13
## 3rd Qu.:0.48660 3rd Qu.:0.42621 3rd Qu.: 900.2 3rd Qu.: 123.85
## Max. :0.82897 Max. :0.95695 Max. :20056.9 Max. :17179.10
head(data)
## X.1 X fips county state cases deaths mask_score ratio_pop_to_physician
## 1 1 3 1001 Autauga Alabama 2634 39 3.003 3264.941
## 2 2 4 1003 Baldwin Alabama 8269 84 2.968 1915.568
## 3 3 5 1005 Barbour Alabama 1161 10 2.928 4211.667
## 4 4 6 1007 Bibb Alabama 1142 17 3.348 1079.429
## 5 5 7 1009 Blount Alabama 2763 36 2.892 5273.909
## 6 6 8 1011 Bullock Alabama 690 19 3.186 1288.625
## unemploy_rate HS_grad_rate flu_vacc_. uninsured median_income pop_size
## 1 0.03863522 0.9000000 0.41 0.08500966 58343 55504
## 2 0.03988336 0.8636158 0.45 0.10699288 56607 212628
## 3 0.05900923 0.8141026 0.37 0.12513197 32490 25270
## 4 0.04385140 0.8376384 0.39 0.09680075 45795 22668
## 5 0.04021393 0.9346880 0.38 0.12114004 48253 58013
## 6 0.04925187 0.7024793 0.26 0.13206483 29113 10309
## X._rural life_expectancy non_hispanic_african_american american_indian
## 1 0.4200216 76.33059 0.19254468 0.004756414
## 2 0.4227910 78.59950 0.08953195 0.007760032
## 3 0.6778963 75.77946 0.47942224 0.006529482
## 4 0.6835261 73.92827 0.21457561 0.004279160
## 5 0.8995150 74.59777 0.01460018 0.006326168
## 6 0.5137438 73.11753 0.69162867 0.008439228
## asian native_hawaiian hispanic non_hispanic_white sim_diversity_index
## 1 0.012791871 0.0010449697 0.02857452 0.7447391 0.4072863
## 2 0.011564799 0.0006866452 0.04550200 0.8304739 0.3000323
## 3 0.004629996 0.0018599129 0.04206569 0.4595568 0.5571248
## 4 0.002205753 0.0011469914 0.02638080 0.7429857 0.4012091
## 5 0.003016565 0.0011721511 0.09565097 0.8694431 0.2346560
## 6 0.001940052 0.0077602095 0.08245223 0.2130178 0.4693396
## X._democrat area pop_density
## 1 0.24622532 594.44 91.8
## 2 0.20207793 1589.78 114.6
## 3 0.47176755 884.88 31.0
## 4 0.21760334 622.58 36.8
## 5 0.08618829 644.78 88.9
## 6 0.75588865 622.81 17.5
drops = c('X', 'X.1','fips', 'county', 'state')
plot(data[ , !(names(data) %in% drops)][1:10])
colnames(data[ , !(names(data) %in% drops)])
## [1] "cases" "deaths"
## [3] "mask_score" "ratio_pop_to_physician"
## [5] "unemploy_rate" "HS_grad_rate"
## [7] "flu_vacc_." "uninsured"
## [9] "median_income" "pop_size"
## [11] "X._rural" "life_expectancy"
## [13] "non_hispanic_african_american" "american_indian"
## [15] "asian" "native_hawaiian"
## [17] "hispanic" "non_hispanic_white"
## [19] "sim_diversity_index" "X._democrat"
## [21] "area" "pop_density"
plot(data[ , !(names(data) %in% drops)][11:22])
summary(data[ , !(names(data) %in% drops)][1:22])
## cases deaths mask_score ratio_pop_to_physician
## Min. : 11.0 Min. : 0.00 Min. :1.433 Min. :-3936
## 1st Qu.: 469.5 1st Qu.: 6.00 1st Qu.:2.701 1st Qu.: 1048
## Median : 1101.5 Median : 17.00 Median :3.000 Median : 1498
## Mean : 4073.6 Mean : 78.48 Mean :2.995 Mean : 1916
## 3rd Qu.: 2758.5 3rd Qu.: 48.00 3rd Qu.:3.296 3rd Qu.: 2202
## Max. :370729.0 Max. :7446.00 Max. :3.849 Max. :51307
## unemploy_rate HS_grad_rate flu_vacc_. uninsured
## Min. :0.01624 Min. :0.3630 Min. :0.0700 Min. :0.02068
## 1st Qu.:0.03567 1st Qu.:0.8467 1st Qu.:0.3500 1st Qu.:0.07036
## Median :0.04388 Median :0.8951 Median :0.4200 Median :0.10272
## Mean :0.04619 Mean :0.8838 Mean :0.4087 Mean :0.10937
## 3rd Qu.:0.05365 3rd Qu.:0.9344 3rd Qu.:0.4800 3rd Qu.:0.13841
## Max. :0.19066 Max. :1.0000 Max. :0.6500 Max. :0.30877
## median_income pop_size X._rural life_expectancy
## Min. : 24783 Min. : 1718 Min. :0.0000 Min. :66.50
## 1st Qu.: 42208 1st Qu.: 13074 1st Qu.:0.3222 1st Qu.:75.49
## Median : 48728 Median : 28270 Median :0.5739 Median :77.48
## Mean : 50996 Mean : 107303 Mean :0.5678 Mean :77.40
## 3rd Qu.: 56546 3rd Qu.: 73641 3rd Qu.:0.8149 3rd Qu.:79.22
## Max. :136191 Max. :10163507 Max. :1.0000 Max. :97.97
## non_hispanic_african_american american_indian asian
## Min. :0.0005099 Min. :0.000496 Min. :0.000000
## 1st Qu.:0.0073038 1st Qu.:0.003656 1st Qu.:0.004604
## Median :0.0244137 Median :0.005952 Median :0.007189
## Mean :0.0925949 Mean :0.019425 Mean :0.015009
## 3rd Qu.:0.1061252 3rd Qu.:0.012304 3rd Qu.:0.014113
## Max. :0.8532961 Max. :0.879781 Max. :0.430067
## native_hawaiian hispanic non_hispanic_white sim_diversity_index
## Min. :0.0000000 Min. :0.005151 Min. :0.0276 Min. :0.04102
## 1st Qu.:0.0003171 1st Qu.:0.022798 1st Qu.:0.6470 1st Qu.:0.14154
## Median :0.0005949 Median :0.042757 Median :0.8371 Median :0.28553
## Mean :0.0012039 Mean :0.094767 Mean :0.7647 Mean :0.31371
## 3rd Qu.:0.0011243 3rd Qu.:0.096909 3rd Qu.:0.9250 3rd Qu.:0.48660
## Max. :0.1312001 Max. :0.963230 Max. :0.9792 Max. :0.82897
## X._democrat area pop_density
## Min. :0.06955 Min. : 2.0 Min. : 0.50
## 1st Qu.:0.21860 1st Qu.: 426.7 1st Qu.: 20.98
## Median :0.30358 Median : 599.9 Median : 48.40
## Mean :0.33789 Mean : 946.6 Mean : 214.13
## 3rd Qu.:0.42621 3rd Qu.: 900.2 3rd Qu.: 123.85
## Max. :0.95695 Max. :20056.9 Max. :17179.10
#testing
library(reshape2)
drops = c('X', 'X.1', 'county', 'state')
df.m <- melt(data[ , !(names(data) %in% drops)], id.var = "fips")
p <- ggplot(data = df.m, aes(x=variable, y=value)) +
geom_boxplot(aes(fill=fips))
p + facet_wrap( ~ variable, scales="free")
sapply(data, class)
## X.1 X
## "integer" "integer"
## fips county
## "integer" "character"
## state cases
## "character" "integer"
## deaths mask_score
## "integer" "numeric"
## ratio_pop_to_physician unemploy_rate
## "numeric" "numeric"
## HS_grad_rate flu_vacc_.
## "numeric" "numeric"
## uninsured median_income
## "numeric" "integer"
## pop_size X._rural
## "integer" "numeric"
## life_expectancy non_hispanic_african_american
## "numeric" "numeric"
## american_indian asian
## "numeric" "numeric"
## native_hawaiian hispanic
## "numeric" "numeric"
## non_hispanic_white sim_diversity_index
## "numeric" "numeric"
## X._democrat area
## "numeric" "numeric"
## pop_density
## "numeric"
basic_model = lm(cases ~ state + mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = data)
summary(basic_model)
##
## Call:
## lm(formula = cases ~ state + mask_score + ratio_pop_to_physician +
## unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income +
## pop_size + X._rural + life_expectancy + sim_diversity_index +
## area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american +
## american_indian + asian + native_hawaiian + hispanic + non_hispanic_white,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51341 -785 -93 602 97809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.080e+04 1.165e+04 -0.927 0.353809
## stateAlaska 8.885e+03 3.975e+03 2.235 0.025486 *
## stateArizona -2.149e+03 1.261e+03 -1.704 0.088577 .
## stateArkansas -2.618e+02 6.513e+02 -0.402 0.687721
## stateCalifornia -6.742e+03 8.667e+02 -7.779 1.01e-14 ***
## stateColorado -2.356e+03 7.786e+02 -3.026 0.002498 **
## stateConnecticut -5.054e+03 1.460e+03 -3.462 0.000543 ***
## stateDelaware -2.943e+03 2.227e+03 -1.322 0.186430
## stateDistrict of Columbia -1.513e+04 3.945e+03 -3.835 0.000128 ***
## stateFlorida -1.777e+02 7.022e+02 -0.253 0.800208
## stateGeorgia -6.235e+02 5.860e+02 -1.064 0.287376
## stateHawaii 1.477e+04 5.370e+03 2.751 0.005983 **
## stateIdaho -5.391e+02 8.546e+02 -0.631 0.528218
## stateIllinois 9.397e+02 6.744e+02 1.393 0.163595
## stateIndiana 2.517e+02 6.473e+02 0.389 0.697448
## stateIowa 8.015e+02 7.142e+02 1.122 0.261883
## stateKansas -3.285e+02 6.405e+02 -0.513 0.608097
## stateKentucky -4.557e+02 6.510e+02 -0.700 0.484049
## stateLouisiana -5.453e+02 6.690e+02 -0.815 0.415119
## stateMaine -2.890e+03 1.118e+03 -2.586 0.009757 **
## stateMaryland -2.208e+03 9.302e+02 -2.374 0.017649 *
## stateMassachusetts -5.768e+03 1.214e+03 -4.752 2.11e-06 ***
## stateMichigan -1.620e+03 7.128e+02 -2.272 0.023160 *
## stateMinnesota 5.050e+02 7.425e+02 0.680 0.496483
## stateMississippi -5.269e+02 6.537e+02 -0.806 0.420323
## stateMissouri 6.647e+01 6.269e+02 0.106 0.915559
## stateMontana -1.515e+02 8.116e+02 -0.187 0.851900
## stateNebraska -1.089e+02 6.916e+02 -0.157 0.874924
## stateNevada 2.408e+02 1.219e+03 0.198 0.843393
## stateNew Hampshire -3.043e+03 1.331e+03 -2.286 0.022356 *
## stateNew Jersey -1.387e+03 1.016e+03 -1.365 0.172391
## stateNew Mexico -4.764e+03 9.931e+02 -4.797 1.70e-06 ***
## stateNew York -3.254e+03 7.825e+02 -4.159 3.29e-05 ***
## stateNorth Carolina -1.875e+03 6.217e+02 -3.016 0.002581 **
## stateNorth Dakota 2.445e+03 1.209e+03 2.022 0.043275 *
## stateOhio -1.617e+03 6.754e+02 -2.395 0.016688 *
## stateOklahoma -5.249e+02 8.601e+02 -0.610 0.541741
## stateOregon -3.933e+03 8.976e+02 -4.382 1.22e-05 ***
## statePennsylvania -3.744e+03 7.228e+02 -5.180 2.37e-07 ***
## stateRhode Island -1.702e+03 1.798e+03 -0.947 0.343865
## stateSouth Carolina -8.345e+02 7.260e+02 -1.149 0.250522
## stateSouth Dakota 4.649e+02 7.560e+02 0.615 0.538583
## stateTennessee 4.004e+02 6.255e+02 0.640 0.522204
## stateTexas -2.663e+03 6.530e+02 -4.077 4.68e-05 ***
## stateUtah 1.155e+03 9.219e+02 1.253 0.210483
## stateVermont -1.796e+03 1.297e+03 -1.385 0.166300
## stateVirginia -1.436e+03 5.980e+02 -2.401 0.016423 *
## stateWashington -4.189e+03 8.734e+02 -4.796 1.70e-06 ***
## stateWest Virginia -1.029e+03 7.512e+02 -1.370 0.170818
## stateWisconsin 1.853e+03 7.396e+02 2.505 0.012303 *
## stateWyoming -5.560e+02 1.011e+03 -0.550 0.582271
## mask_score 4.137e+02 3.107e+02 1.332 0.183092
## ratio_pop_to_physician -5.566e-02 3.973e-02 -1.401 0.161365
## unemploy_rate 1.051e+04 7.019e+03 1.498 0.134359
## HS_grad_rate -1.043e+03 1.315e+03 -0.793 0.427626
## flu_vacc_. 1.347e+03 1.026e+03 1.314 0.188988
## uninsured -1.149e+03 4.343e+03 -0.265 0.791294
## median_income -2.338e-02 1.005e-02 -2.326 0.020098 *
## pop_size 4.057e-02 2.596e-04 156.305 < 2e-16 ***
## X._rural -9.170e+02 3.648e+02 -2.514 0.011986 *
## life_expectancy 1.352e+02 4.162e+01 3.247 0.001178 **
## sim_diversity_index 1.863e+03 9.891e+02 1.884 0.059715 .
## area 1.805e-02 7.895e-02 0.229 0.819143
## X._democrat -3.167e+02 1.052e+03 -0.301 0.763428
## pop_density 7.973e-01 1.220e-01 6.533 7.59e-11 ***
## non_hispanic_african_american 1.360e+03 1.114e+04 0.122 0.902878
## american_indian 1.655e+03 1.133e+04 0.146 0.883938
## asian -7.723e+04 1.236e+04 -6.247 4.79e-10 ***
## native_hawaiian -3.299e+04 4.739e+04 -0.696 0.486302
## hispanic 8.683e+03 1.087e+04 0.799 0.424328
## non_hispanic_white 1.560e+03 1.148e+04 0.136 0.891904
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3741 on 2873 degrees of freedom
## Multiple R-squared: 0.9262, Adjusted R-squared: 0.9244
## F-statistic: 514.8 on 70 and 2873 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(basic_model)
## Warning: not plotting observations with leverage one:
## 68, 283
boxcox(basic_model)
hist(data$cases, breaks = 200)
hist(log(data$cases), breaks = 200)
cases is needed. The same basic model with the transformed cases response variable was then computed to understand the improvements to the model.log_model = lm(log(cases) ~ state + mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = data)
summary(log_model)
##
## Call:
## lm(formula = log(cases) ~ state + mask_score + ratio_pop_to_physician +
## unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income +
## pop_size + X._rural + life_expectancy + sim_diversity_index +
## area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american +
## american_indian + asian + native_hawaiian + hispanic + non_hispanic_white,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2257 -0.3917 0.0555 0.4466 2.6514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.511e+00 2.161e+00 3.476 0.000517 ***
## stateAlaska -1.724e+00 7.376e-01 -2.337 0.019524 *
## stateArizona -1.229e+00 2.340e-01 -5.254 1.60e-07 ***
## stateArkansas -6.024e-01 1.208e-01 -4.985 6.55e-07 ***
## stateCalifornia -1.858e+00 1.608e-01 -11.554 < 2e-16 ***
## stateColorado -1.810e+00 1.445e-01 -12.531 < 2e-16 ***
## stateConnecticut -8.421e-01 2.708e-01 -3.109 0.001896 **
## stateDelaware -4.424e-01 4.132e-01 -1.071 0.284382
## stateDistrict of Columbia -9.940e-01 7.320e-01 -1.358 0.174600
## stateFlorida -2.057e-01 1.303e-01 -1.579 0.114520
## stateGeorgia -8.059e-01 1.087e-01 -7.413 1.62e-13 ***
## stateHawaii -7.262e+00 9.964e-01 -7.289 4.03e-13 ***
## stateIdaho -9.190e-01 1.586e-01 -5.796 7.53e-09 ***
## stateIllinois -6.896e-01 1.251e-01 -5.511 3.88e-08 ***
## stateIndiana -5.565e-01 1.201e-01 -4.633 3.77e-06 ***
## stateIowa -6.498e-01 1.325e-01 -4.904 9.94e-07 ***
## stateKansas -1.307e+00 1.189e-01 -10.994 < 2e-16 ***
## stateKentucky -7.489e-01 1.208e-01 -6.199 6.49e-10 ***
## stateLouisiana -4.306e-01 1.241e-01 -3.469 0.000531 ***
## stateMaine -1.777e+00 2.074e-01 -8.570 < 2e-16 ***
## stateMaryland -1.154e+00 1.726e-01 -6.686 2.75e-11 ***
## stateMassachusetts -1.356e+00 2.252e-01 -6.022 1.94e-09 ***
## stateMichigan -5.552e-01 1.323e-01 -4.198 2.78e-05 ***
## stateMinnesota -6.922e-01 1.378e-01 -5.025 5.35e-07 ***
## stateMississippi -3.192e-01 1.213e-01 -2.632 0.008541 **
## stateMissouri -4.687e-01 1.163e-01 -4.030 5.73e-05 ***
## stateMontana -1.206e+00 1.506e-01 -8.007 1.70e-15 ***
## stateNebraska -1.143e+00 1.283e-01 -8.904 < 2e-16 ***
## stateNevada -2.834e+00 2.261e-01 -12.531 < 2e-16 ***
## stateNew Hampshire -1.453e+00 2.470e-01 -5.881 4.54e-09 ***
## stateNew Jersey -1.034e+00 1.885e-01 -5.485 4.50e-08 ***
## stateNew Mexico -2.012e+00 1.843e-01 -10.916 < 2e-16 ***
## stateNew York -1.367e+00 1.452e-01 -9.414 < 2e-16 ***
## stateNorth Carolina -6.134e-01 1.154e-01 -5.318 1.13e-07 ***
## stateNorth Dakota -6.513e-01 2.243e-01 -2.903 0.003725 **
## stateOhio -5.446e-01 1.253e-01 -4.346 1.44e-05 ***
## stateOklahoma -8.593e-01 1.596e-01 -5.384 7.86e-08 ***
## stateOregon -2.186e+00 1.665e-01 -13.126 < 2e-16 ***
## statePennsylvania -8.953e-01 1.341e-01 -6.675 2.95e-11 ***
## stateRhode Island -1.505e+00 3.337e-01 -4.510 6.75e-06 ***
## stateSouth Carolina -2.000e-01 1.347e-01 -1.485 0.137733
## stateSouth Dakota -9.209e-01 1.403e-01 -6.565 6.14e-11 ***
## stateTennessee -1.426e-01 1.161e-01 -1.228 0.219410
## stateTexas -1.471e+00 1.212e-01 -12.137 < 2e-16 ***
## stateUtah -1.655e+00 1.710e-01 -9.677 < 2e-16 ***
## stateVermont -2.112e+00 2.407e-01 -8.776 < 2e-16 ***
## stateVirginia -1.609e+00 1.110e-01 -14.505 < 2e-16 ***
## stateWashington -1.778e+00 1.621e-01 -10.971 < 2e-16 ***
## stateWest Virginia -1.166e+00 1.394e-01 -8.362 < 2e-16 ***
## stateWisconsin -1.899e-02 1.372e-01 -0.138 0.889970
## stateWyoming -1.720e+00 1.875e-01 -9.174 < 2e-16 ***
## mask_score 5.334e-02 5.764e-02 0.925 0.354810
## ratio_pop_to_physician -2.397e-05 7.372e-06 -3.251 0.001162 **
## unemploy_rate 2.294e+00 1.302e+00 1.761 0.078340 .
## HS_grad_rate -6.235e-01 2.439e-01 -2.556 0.010635 *
## flu_vacc_. 2.497e+00 1.903e-01 13.122 < 2e-16 ***
## uninsured 9.895e-01 8.059e-01 1.228 0.219607
## median_income 1.675e-05 1.866e-06 8.980 < 2e-16 ***
## pop_size 1.036e-06 4.816e-08 21.504 < 2e-16 ***
## X._rural -2.290e+00 6.768e-02 -33.832 < 2e-16 ***
## life_expectancy -3.685e-02 7.723e-03 -4.772 1.92e-06 ***
## sim_diversity_index 8.284e-01 1.835e-01 4.514 6.63e-06 ***
## area 9.784e-05 1.465e-05 6.679 2.88e-11 ***
## X._democrat 1.641e-01 1.952e-01 0.841 0.400635
## pop_density 3.744e-05 2.264e-05 1.653 0.098342 .
## non_hispanic_african_american 2.022e+00 2.067e+00 0.978 0.327970
## american_indian 2.617e+00 2.103e+00 1.244 0.213423
## asian 3.576e+00 2.294e+00 1.559 0.119125
## native_hawaiian 4.570e+01 8.792e+00 5.198 2.16e-07 ***
## hispanic 2.723e+00 2.016e+00 1.351 0.176946
## non_hispanic_white 2.457e+00 2.130e+00 1.153 0.248818
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6942 on 2873 degrees of freedom
## Multiple R-squared: 0.773, Adjusted R-squared: 0.7674
## F-statistic: 139.7 on 70 and 2873 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(log_model)
## Warning: not plotting observations with leverage one:
## 68, 283
- The log transformation of our response variable, cases, produces a much better Normal Q-Q but it is still very heavy tailed on the negative side of the standardized residuals as seen in the code and output in appendix 2. Though, the model has shown improvement with these steps we see one of the main predictors in the model appears to be population size or
pop_size which is not very informative in the final model that would be useful in constructing. Therefore, it is reasonable to consider normalizing the total cases based on population size.
data_norm = data
data_norm$cases_norm = data_norm$cases/data_norm$pop_size
norm_model = lm(cases_norm ~ state + mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = data_norm)
# setting this up for usage later
set.seed(10)
sample = sample.split(data_norm,SplitRatio = 0.5)
data.t = subset(data_norm,sample ==TRUE)
data.v = subset(data_norm,sample ==FALSE)
boxcox(norm_model)
par(mfrow = c(2,2))
plot(norm_model)
## Warning: not plotting observations with leverage one:
## 68, 283
hist(data_norm$cases_norm, breaks = 200)
hist(sqrt(data_norm$cases_norm), breaks = 200)
sqrt_norm_model = lm(sqrt(cases_norm) ~ state + mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + X._rural + life_expectancy + area + X._democrat + sim_diversity_index + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = data_norm)
summary(sqrt_norm_model)
##
## Call:
## lm(formula = sqrt(cases_norm) ~ state + mask_score + ratio_pop_to_physician +
## unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income +
## X._rural + life_expectancy + area + X._democrat + sim_diversity_index +
## life_expectancy + pop_density + non_hispanic_african_american +
## american_indian + asian + native_hawaiian + hispanic + non_hispanic_white,
## data = data_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.107070 -0.018932 -0.001631 0.016028 0.223177
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.621e-01 9.794e-02 -5.739 1.05e-08 ***
## stateAlaska 4.076e-02 3.342e-02 1.219 0.222755
## stateArizona -4.093e-02 1.061e-02 -3.859 0.000116 ***
## stateArkansas -1.361e-06 5.477e-03 0.000 0.999802
## stateCalifornia -7.600e-02 7.283e-03 -10.435 < 2e-16 ***
## stateColorado -5.367e-02 6.546e-03 -8.198 3.63e-16 ***
## stateConnecticut -3.819e-02 1.227e-02 -3.112 0.001875 **
## stateDelaware -2.613e-02 1.873e-02 -1.395 0.163018
## stateDistrict of Columbia -3.668e-02 3.316e-02 -1.106 0.268736
## stateFlorida -2.485e-03 5.899e-03 -0.421 0.673592
## stateGeorgia -1.353e-02 4.926e-03 -2.746 0.006078 **
## stateHawaii -1.361e-01 4.506e-02 -3.019 0.002554 **
## stateIdaho 4.181e-03 7.186e-03 0.582 0.560767
## stateIllinois 2.428e-02 5.668e-03 4.284 1.89e-05 ***
## stateIndiana 3.798e-03 5.444e-03 0.698 0.485448
## stateIowa 5.223e-02 6.006e-03 8.695 < 2e-16 ***
## stateKansas 1.782e-02 5.387e-03 3.309 0.000948 ***
## stateKentucky -1.444e-02 5.473e-03 -2.638 0.008390 **
## stateLouisiana 1.835e-03 5.626e-03 0.326 0.744296
## stateMaine -9.074e-02 9.397e-03 -9.656 < 2e-16 ***
## stateMaryland -4.063e-02 7.821e-03 -5.195 2.19e-07 ***
## stateMassachusetts -3.433e-02 1.021e-02 -3.363 0.000780 ***
## stateMichigan -8.421e-03 5.992e-03 -1.405 0.159996
## stateMinnesota 2.734e-02 6.244e-03 4.380 1.23e-05 ***
## stateMississippi 2.373e-03 5.497e-03 0.432 0.665986
## stateMissouri 9.972e-03 5.271e-03 1.892 0.058612 .
## stateMontana 2.899e-02 6.825e-03 4.248 2.23e-05 ***
## stateNebraska 1.524e-02 5.816e-03 2.621 0.008815 **
## stateNevada -6.629e-02 1.023e-02 -6.477 1.10e-10 ***
## stateNew Hampshire -7.590e-02 1.120e-02 -6.779 1.46e-11 ***
## stateNew Jersey -4.059e-02 8.532e-03 -4.757 2.06e-06 ***
## stateNew Mexico -8.404e-02 8.344e-03 -10.072 < 2e-16 ***
## stateNew York -6.359e-02 6.580e-03 -9.664 < 2e-16 ***
## stateNorth Carolina -3.303e-02 5.228e-03 -6.318 3.06e-10 ***
## stateNorth Dakota 8.842e-02 1.017e-02 8.696 < 2e-16 ***
## stateOhio -2.265e-02 5.676e-03 -3.990 6.77e-05 ***
## stateOklahoma 1.759e-02 7.233e-03 2.431 0.015098 *
## stateOregon -7.091e-02 7.546e-03 -9.397 < 2e-16 ***
## statePennsylvania -5.237e-02 6.076e-03 -8.618 < 2e-16 ***
## stateRhode Island -2.829e-02 1.512e-02 -1.871 0.061453 .
## stateSouth Carolina -2.219e-02 6.106e-03 -3.634 0.000284 ***
## stateSouth Dakota 7.617e-02 6.357e-03 11.982 < 2e-16 ***
## stateTennessee 2.359e-02 5.260e-03 4.484 7.60e-06 ***
## stateTexas -5.743e-02 5.488e-03 -10.465 < 2e-16 ***
## stateUtah -2.416e-02 7.752e-03 -3.116 0.001849 **
## stateVermont -8.616e-02 1.091e-02 -7.899 3.98e-15 ***
## stateVirginia -4.222e-02 5.018e-03 -8.413 < 2e-16 ***
## stateWashington -5.640e-02 7.345e-03 -7.679 2.18e-14 ***
## stateWest Virginia -5.279e-02 6.315e-03 -8.360 < 2e-16 ***
## stateWisconsin 5.752e-02 6.220e-03 9.248 < 2e-16 ***
## stateWyoming -6.723e-04 8.493e-03 -0.079 0.936912
## mask_score -1.333e-02 2.609e-03 -5.108 3.47e-07 ***
## ratio_pop_to_physician -8.594e-07 3.341e-07 -2.572 0.010149 *
## unemploy_rate -2.469e-01 5.898e-02 -4.186 2.92e-05 ***
## HS_grad_rate 1.572e-02 1.105e-02 1.423 0.154904
## flu_vacc_. 3.769e-02 8.624e-03 4.370 1.28e-05 ***
## uninsured -6.465e-02 3.650e-02 -1.771 0.076633 .
## median_income -3.500e-07 8.454e-08 -4.141 3.56e-05 ***
## X._rural -2.178e-02 3.062e-03 -7.111 1.44e-12 ***
## life_expectancy 1.014e-05 3.500e-04 0.029 0.976895
## area 1.670e-07 6.569e-07 0.254 0.799282
## X._democrat -8.072e-02 8.845e-03 -9.126 < 2e-16 ***
## sim_diversity_index 1.304e-02 8.318e-03 1.567 0.117126
## pop_density 7.062e-07 1.009e-06 0.700 0.484073
## non_hispanic_african_american 9.581e-01 9.368e-02 10.228 < 2e-16 ***
## american_indian 9.632e-01 9.529e-02 10.108 < 2e-16 ***
## asian 9.351e-01 1.037e-01 9.020 < 2e-16 ***
## native_hawaiian 2.249e+00 3.985e-01 5.645 1.81e-08 ***
## hispanic 9.772e-01 9.137e-02 10.694 < 2e-16 ***
## non_hispanic_white 8.347e-01 9.654e-02 8.647 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03146 on 2874 degrees of freedom
## Multiple R-squared: 0.6574, Adjusted R-squared: 0.6492
## F-statistic: 79.94 on 69 and 2874 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(sqrt_norm_model)
## Warning: not plotting observations with leverage one:
## 68, 283
- After transforming the covid cased normalized by population size the model Q-Q plot is not great…… Now that basic model exploration has been conducted more advanced selection techniques will be use such as regsubsets in R as well as stepAIC. Regsubsets was performed using nvmax=10 and nbest=6 represent the maximum size of subsets to be examined and the number of subsets of each size to record, respectively. Regsubsets was performed using forward and backward stepwise search since the exhaustive search with second order interactions was deemed slow by the leaps library in R.
sqrt_norm_model = lm(sqrt(cases_norm) ~ mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + X._rural + life_expectancy + area + X._democrat + sim_diversity_index + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white, data = data_norm)
summary(sqrt_norm_model)
##
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + ratio_pop_to_physician +
## unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income +
## X._rural + life_expectancy + area + X._democrat + sim_diversity_index +
## life_expectancy + pop_density + non_hispanic_african_american +
## american_indian + asian + native_hawaiian + hispanic + non_hispanic_white,
## data = data_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.145860 -0.025914 -0.001729 0.023545 0.243744
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.430e-01 9.467e-02 -4.679 3.01e-06 ***
## mask_score -6.594e-02 2.525e-03 -26.115 < 2e-16 ***
## ratio_pop_to_physician -1.196e-06 4.217e-07 -2.836 0.00461 **
## unemploy_rate -6.371e-01 6.406e-02 -9.946 < 2e-16 ***
## HS_grad_rate 2.301e-02 1.244e-02 1.850 0.06443 .
## flu_vacc_. 5.001e-02 1.005e-02 4.975 6.92e-07 ***
## uninsured -1.500e-01 2.592e-02 -5.788 7.89e-09 ***
## median_income -4.417e-07 9.907e-08 -4.458 8.58e-06 ***
## X._rural -3.022e-02 3.651e-03 -8.277 < 2e-16 ***
## life_expectancy 1.710e-03 4.031e-04 4.242 2.29e-05 ***
## area -3.602e-06 6.563e-07 -5.487 4.43e-08 ***
## X._democrat -6.849e-02 9.510e-03 -7.202 7.53e-13 ***
## sim_diversity_index -1.452e-02 9.720e-03 -1.494 0.13540
## pop_density -2.260e-07 1.219e-06 -0.185 0.85293
## non_hispanic_african_american 9.370e-01 8.890e-02 10.540 < 2e-16 ***
## american_indian 1.022e+00 9.224e-02 11.085 < 2e-16 ***
## asian 7.943e-01 1.034e-01 7.685 2.08e-14 ***
## native_hawaiian 1.482e+00 2.965e-01 4.999 6.10e-07 ***
## hispanic 8.800e-01 8.620e-02 10.209 < 2e-16 ***
## non_hispanic_white 7.599e-01 9.206e-02 8.255 2.28e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04098 on 2924 degrees of freedom
## Multiple R-squared: 0.4087, Adjusted R-squared: 0.4049
## F-statistic: 106.4 on 19 and 2924 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(sqrt_norm_model)
summary(sqrt_norm_model)
##
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + ratio_pop_to_physician +
## unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income +
## X._rural + life_expectancy + area + X._democrat + sim_diversity_index +
## life_expectancy + pop_density + non_hispanic_african_american +
## american_indian + asian + native_hawaiian + hispanic + non_hispanic_white,
## data = data_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.145860 -0.025914 -0.001729 0.023545 0.243744
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.430e-01 9.467e-02 -4.679 3.01e-06 ***
## mask_score -6.594e-02 2.525e-03 -26.115 < 2e-16 ***
## ratio_pop_to_physician -1.196e-06 4.217e-07 -2.836 0.00461 **
## unemploy_rate -6.371e-01 6.406e-02 -9.946 < 2e-16 ***
## HS_grad_rate 2.301e-02 1.244e-02 1.850 0.06443 .
## flu_vacc_. 5.001e-02 1.005e-02 4.975 6.92e-07 ***
## uninsured -1.500e-01 2.592e-02 -5.788 7.89e-09 ***
## median_income -4.417e-07 9.907e-08 -4.458 8.58e-06 ***
## X._rural -3.022e-02 3.651e-03 -8.277 < 2e-16 ***
## life_expectancy 1.710e-03 4.031e-04 4.242 2.29e-05 ***
## area -3.602e-06 6.563e-07 -5.487 4.43e-08 ***
## X._democrat -6.849e-02 9.510e-03 -7.202 7.53e-13 ***
## sim_diversity_index -1.452e-02 9.720e-03 -1.494 0.13540
## pop_density -2.260e-07 1.219e-06 -0.185 0.85293
## non_hispanic_african_american 9.370e-01 8.890e-02 10.540 < 2e-16 ***
## american_indian 1.022e+00 9.224e-02 11.085 < 2e-16 ***
## asian 7.943e-01 1.034e-01 7.685 2.08e-14 ***
## native_hawaiian 1.482e+00 2.965e-01 4.999 6.10e-07 ***
## hispanic 8.800e-01 8.620e-02 10.209 < 2e-16 ***
## non_hispanic_white 7.599e-01 9.206e-02 8.255 2.28e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04098 on 2924 degrees of freedom
## Multiple R-squared: 0.4087, Adjusted R-squared: 0.4049
## F-statistic: 106.4 on 19 and 2924 DF, p-value: < 2.2e-16
#print(xtable(as.data.frame(sqrt_norm_model$coefficients), type = "latex"))
r2_normal <- function(model){
test <- anova(model)
sse <- test[,2]
names <- rownames(test)
total_sse <- sum(sse)
output <- cbind(names,sse/total_sse)
colnames(output) <- c("variables", "r2")
return(output)
}
#print(r2_normal(sqrt_norm_model))
?regsubsets
sub_set = regsubsets(sqrt(cases_norm) ~ (mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white)^2, data=data.t, nvmax=10, nbest=6, method='forward')
res.sum <- summary(sub_set)
summary(res.sum,all.best=TRUE,matrix=T,matrix.logical=F,df=NULL)
## Length Class Mode
## which 12660 -none- logical
## rsq 60 -none- numeric
## rss 60 -none- numeric
## adjr2 60 -none- numeric
## cp 60 -none- numeric
## bic 60 -none- numeric
## outmat 12600 -none- character
## obj 28 regsubsets list
#max(res.sum$adjr2)
#res.sum$outmat[60,]
res.sum$rsq[60]
## [1] 0.3931536
res.sum$bic[60]
## [1] -654.9958
res.sum$adjr2[60]
## [1] 0.389
res.sum$cp[60]
## [1] 397.9193
labels(which(res.sum$outmat[60,]=='*'))
## [1] "mask_score" "mask_score:unemploy_rate"
## [3] "ratio_pop_to_physician:uninsured" "unemploy_rate:X._democrat"
## [5] "unemploy_rate:non_hispanic_white" "flu_vacc_.:hispanic"
## [7] "median_income:asian" "X._rural:asian"
## [9] "X._rural:hispanic" "area:non_hispanic_white"
sub_set = regsubsets(sqrt(cases_norm) ~ (mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white)^2, data=data.t, nvmax=10, nbest=6, method='backward')
## Warning in warn.extra(rval): model with initial (1) variables was better, and is
## reported
res.sum <- summary(sub_set)
summary(res.sum,all.best=TRUE,matrix=T,matrix.logical=F,df=NULL)
## Length Class Mode
## which 12027 -none- logical
## rsq 57 -none- numeric
## rss 57 -none- numeric
## adjr2 57 -none- numeric
## cp 57 -none- numeric
## bic 57 -none- numeric
## outmat 11970 -none- character
## obj 28 regsubsets list
#max(res.sum$adjr2)
#res.sum$outmat[60,]
res.sum$rsq[57]
## [1] 0.364777
res.sum$bic[57]
## [1] -587.7248
res.sum$adjr2[57]
## [1] 0.3604291
res.sum$cp[57]
## [1] 484.3295
#final_model = lm(sqrt(cases_norm) ~ mask_score:unemploy_rate + mask_score + mask_score:median_income + unemploy_rate:non_hispanic_white + X._rural:X._democrat, data = data_norm)
#final_model = lm(sqrt(cases_norm) ~ mask_score + unemploy_rate:X._rural + uninsured:non_hispanic_white + area:american_indian + mask_score:unemploy_rate + unemploy_rate:X._democrat + median_income:X._rural + ratio_pop_to_physician:asian + unemploy_rate:non_hispanic_white + X._rural:area, data = data.t)
final_model = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data.t)
summary(final_model)
##
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + mask_score:unemploy_rate +
## ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured +
## unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white +
## flu_vacc_.:hispanic + median_income:asian + X._rural:asian +
## X._rural:hispanic + area:non_hispanic_white, data = data.t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.141643 -0.026402 -0.002281 0.023550 0.246229
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.086e-01 1.163e-02 43.740 < 2e-16 ***
## mask_score -8.594e-02 3.947e-03 -21.772 < 2e-16 ***
## mask_score:unemploy_rate 5.169e-01 6.719e-02 7.694 2.60e-14 ***
## ratio_pop_to_physician:uninsured -3.006e-05 5.079e-06 -5.919 4.03e-09 ***
## unemploy_rate:X._democrat -1.143e+00 2.389e-01 -4.786 1.88e-06 ***
## unemploy_rate:non_hispanic_white -2.568e+00 1.802e-01 -14.256 < 2e-16 ***
## flu_vacc_.:hispanic 1.194e-01 3.869e-02 3.088 0.002055 **
## median_income:asian -2.088e-06 5.341e-07 -3.910 9.65e-05 ***
## asian:X._rural -1.085e+00 2.894e-01 -3.748 0.000185 ***
## hispanic:X._rural -1.165e-01 2.289e-02 -5.089 4.07e-07 ***
## non_hispanic_white:area -6.762e-06 1.286e-06 -5.258 1.68e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04204 on 1461 degrees of freedom
## Multiple R-squared: 0.3932, Adjusted R-squared: 0.389
## F-statistic: 94.65 on 10 and 1461 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(final_model)
final_model$coefficients
## (Intercept) mask_score
## 5.086201e-01 -8.593599e-02
## mask_score:unemploy_rate ratio_pop_to_physician:uninsured
## 5.169467e-01 -3.005965e-05
## unemploy_rate:X._democrat unemploy_rate:non_hispanic_white
## -1.143084e+00 -2.568189e+00
## flu_vacc_.:hispanic median_income:asian
## 1.194486e-01 -2.088325e-06
## asian:X._rural hispanic:X._rural
## -1.084744e+00 -1.164722e-01
## non_hispanic_white:area
## -6.762233e-06
get_model_stats <- function(cand_model, full_model, data.t, data.v){
adj_r2_train <- summary(cand_model)$adj.r.squared
n <- nrow(data.t)
sse_fs1 <- anova(cand_model)["Residuals", 2] #SSE
mse_fs1 <- anova(cand_model)["Residuals", 3] #MSE
mse_full <- anova(full_model)["Residuals", 3] #MSE for full model
p_fs1 <- length(cand_model$coefficients) #number of coefficients
Cp_fs1 <- sse_fs1/mse_full - (n - 2*p_fs1)
aic_fs1 <- n*log(sse_fs1/n) + 2*p_fs1
e.fs1=cand_model$residuals # residuals
h.fs1=influence(cand_model)$hat # diagonals of hat matrix
press.fs1= sum(e.fs1^2/(1-h.fs1)^2) # calculate pressP
#Get MSPE_v from new data for model 1
newdata = data.v[, 1:27]
y.hat = predict(cand_model, newdata)
MSPE = mean((data.v$cases_norm- y.hat)^2)
sse_t = sum(cand_model$residuals^2)
output <- c(Cp_fs1, p_fs1, aic_fs1, press.fs1, adj_r2_train, MSPE, sse_t/nrow(data.t), press.fs1/nrow(data.t))
return(output)
}
# train models with more variables...
none_mod = lm(sqrt(cases_norm)~1, data=data.t)
model_full_2order = lm(sqrt(cases_norm)~(mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white)^2, data=data.t)
full_mod = lm(sqrt(cases_norm)~(mask_score + ratio_pop_to_physician + unemploy_rate + HS_grad_rate + flu_vacc_. + uninsured + median_income + pop_size + X._rural + life_expectancy + sim_diversity_index + area + X._democrat + life_expectancy + pop_density + non_hispanic_african_american + american_indian + asian + native_hawaiian + hispanic + non_hispanic_white)^2, data=data_norm)
model_AIC_2order_3 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 3)
model_AIC_2order_5 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 5)
model_AIC_2order_7 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 7)
model_AIC_2order_10 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 10)
model_AIC_2order_15 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 15)
model_AIC_2order_20 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 20)
model_AIC_2order_25 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 25)
model_AIC_2order_30 <- stepAIC(none_mod, scope=list(upper=full_mod, lower = ~1), direction="forward", k=2, trace = FALSE, steps = 30)
?regsubsets
a <- get_model_stats(model_AIC_2order_10, model_full_2order, data.t, data.v)
b <- get_model_stats(model_AIC_2order_15, model_full_2order, data.t, data.v)
c <- get_model_stats(model_AIC_2order_20, model_full_2order, data.t, data.v)
d <- get_model_stats(model_AIC_2order_25, model_full_2order, data.t, data.v)
e <- get_model_stats(model_AIC_2order_30, model_full_2order, data.t, data.v)
f <- get_model_stats(model_full_2order, model_full_2order, data.t, data.v)
x <- get_model_stats(model_AIC_2order_3, model_full_2order, data.t, data.v)
y <- get_model_stats(model_AIC_2order_5, model_full_2order, data.t, data.v)
z <- get_model_stats(model_AIC_2order_7, model_full_2order, data.t, data.v)
r <- get_model_stats(final_model, model_full_2order, data.t, data.v)
compare_models <- rbind(x,y,z,a,b,c,d,e,f,r)
rownames(compare_models) <- c("3 var","5 var","7 var","10 var","15 var","20 var","25 var","30 var","full model","regsubsetsmodel")
colnames(compare_models) <- c("Cp", "p", "aic", "pressP", "adj_r2", "MSPE", "sse/n", "pressP/n")
as.data.frame(compare_models)
## Cp p aic pressP adj_r2 MSPE
## 3 var 601.8137 4 -9171.939 2.898576 0.3217763 0.02476102
## 5 var 532.8505 6 -9220.870 2.806522 0.3448381 0.02486011
## 7 var 494.8167 8 -9248.250 2.757549 0.3577809 0.02493503
## 10 var 440.1028 11 -9288.790 2.686792 0.3764905 0.02496562
## 15 var 344.3974 16 -9363.504 2.548177 0.4093393 0.02488677
## 20 var 272.2915 21 -9422.843 2.460923 0.4345758 0.02507176
## 25 var 239.8135 26 -9450.041 2.430395 0.4467787 0.02510464
## 30 var 202.3095 31 -9482.784 2.379926 0.4607474 0.02507898
## full model 191.0000 191 -9501.383 3.272378 0.5180435 0.02552738
## regsubsetsmodel 402.1817 11 -9318.623 2.632862 0.3890000 0.02520130
## sse/n pressP/n
## 3 var 0.001956943 0.001969141
## 5 var 0.001887825 0.001906604
## 7 var 0.001848006 0.001873335
## 10 var 0.001790492 0.001825266
## 15 var 0.001690358 0.001731099
## 20 var 0.001612579 0.001671823
## 25 var 0.001572340 0.001651083
## 30 var 0.001527339 0.001616798
## full model 0.001213490 0.002223083
## regsubsetsmodel 0.001754570 0.001788629
library(xtable)
?print.xtable
print(xtable(as.data.frame(compare_models), type = "latex"))
## % latex table generated in R 4.0.3 by xtable 1.8-4 package
## % Sun Dec 13 17:47:07 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrrrr}
## \hline
## & Cp & p & aic & pressP & adj\_r2 & MSPE & sse/n & pressP/n \\
## \hline
## 3 var & 601.81 & 4.00 & -9171.94 & 2.90 & 0.32 & 0.02 & 0.00 & 0.00 \\
## 5 var & 532.85 & 6.00 & -9220.87 & 2.81 & 0.34 & 0.02 & 0.00 & 0.00 \\
## 7 var & 494.82 & 8.00 & -9248.25 & 2.76 & 0.36 & 0.02 & 0.00 & 0.00 \\
## 10 var & 440.10 & 11.00 & -9288.79 & 2.69 & 0.38 & 0.02 & 0.00 & 0.00 \\
## 15 var & 344.40 & 16.00 & -9363.50 & 2.55 & 0.41 & 0.02 & 0.00 & 0.00 \\
## 20 var & 272.29 & 21.00 & -9422.84 & 2.46 & 0.43 & 0.03 & 0.00 & 0.00 \\
## 25 var & 239.81 & 26.00 & -9450.04 & 2.43 & 0.45 & 0.03 & 0.00 & 0.00 \\
## 30 var & 202.31 & 31.00 & -9482.78 & 2.38 & 0.46 & 0.03 & 0.00 & 0.00 \\
## full model & 191.00 & 191.00 & -9501.38 & 3.27 & 0.52 & 0.03 & 0.00 & 0.00 \\
## regsubsetsmodel & 402.18 & 11.00 & -9318.62 & 2.63 & 0.39 & 0.03 & 0.00 & 0.00 \\
## \hline
## \end{tabular}
## \end{table}
plot(as.data.frame(compare_models)$aic)
plot(as.data.frame(compare_models)$Cp)
plot(as.data.frame(compare_models)$adj_r2)
final_model = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data_norm)
r2_normal <- function(model){
test <- anova(model)
sse <- test[,2]
names <- rownames(test)
total_sse <- sum(sse)
output <- cbind(names,sse/total_sse)
colnames(output) <- c("variables", "r2")
return(output)
}
summary(final_model)
##
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + mask_score:unemploy_rate +
## ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured +
## unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white +
## flu_vacc_.:hispanic + median_income:asian + X._rural:asian +
## X._rural:hispanic + area:non_hispanic_white, data = data_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.136577 -0.026452 -0.001916 0.023980 0.246871
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.982e-01 7.644e-03 65.168 < 2e-16 ***
## mask_score -8.319e-02 2.580e-03 -32.251 < 2e-16 ***
## mask_score:unemploy_rate 4.865e-01 4.202e-02 11.579 < 2e-16 ***
## ratio_pop_to_physician:uninsured -1.383e-05 2.511e-06 -5.509 3.93e-08 ***
## unemploy_rate:X._democrat -1.118e+00 1.560e-01 -7.164 9.87e-13 ***
## unemploy_rate:non_hispanic_white -2.538e+00 1.167e-01 -21.755 < 2e-16 ***
## flu_vacc_.:hispanic 1.175e-01 2.431e-02 4.835 1.40e-06 ***
## median_income:asian -1.785e-06 3.720e-07 -4.799 1.68e-06 ***
## asian:X._rural -7.132e-01 1.968e-01 -3.625 0.000294 ***
## hispanic:X._rural -1.309e-01 1.581e-02 -8.275 < 2e-16 ***
## non_hispanic_white:area -5.858e-06 8.639e-07 -6.781 1.43e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04127 on 2933 degrees of freedom
## Multiple R-squared: 0.3985, Adjusted R-squared: 0.3964
## F-statistic: 194.3 on 10 and 2933 DF, p-value: < 2.2e-16
df_final_model = as.data.frame(final_model$coefficients)
df_final_model$r2 = c('NA',r2_normal(final_model)[,2][1:10])
df_final_model
## final_model$coefficients r2
## (Intercept) 4.981737e-01 NA
## mask_score -8.319280e-02 0.181703380784454
## mask_score:unemploy_rate 4.865244e-01 0.00281206648342403
## ratio_pop_to_physician:uninsured -1.383041e-05 2.14400529371138e-05
## unemploy_rate:X._democrat -1.117925e+00 0.0431924280537644
## unemploy_rate:non_hispanic_white -2.537833e+00 0.136023534260933
## flu_vacc_.:hispanic 1.175145e-01 6.98851182152494e-08
## median_income:asian -1.785194e-06 0.00311190366257682
## asian:X._rural -7.132081e-01 0.00750359975974371
## hispanic:X._rural -1.308632e-01 0.0146790170369514
## non_hispanic_white:area -5.858479e-06 0.00943166450959449
print(xtable(as.data.frame(df_final_model), type = "latex"))
## % latex table generated in R 4.0.3 by xtable 1.8-4 package
## % Sun Dec 13 17:47:07 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrl}
## \hline
## & final\_model\$coefficients & r2 \\
## \hline
## (Intercept) & 0.50 & NA \\
## mask\_score & -0.08 & 0.181703380784454 \\
## mask\_score:unemploy\_rate & 0.49 & 0.00281206648342403 \\
## ratio\_pop\_to\_physician:uninsured & -0.00 & 2.14400529371138e-05 \\
## unemploy\_rate:X.\_democrat & -1.12 & 0.0431924280537644 \\
## unemploy\_rate:non\_hispanic\_white & -2.54 & 0.136023534260933 \\
## flu\_vacc\_.:hispanic & 0.12 & 6.98851182152494e-08 \\
## median\_income:asian & -0.00 & 0.00311190366257682 \\
## asian:X.\_rural & -0.71 & 0.00750359975974371 \\
## hispanic:X.\_rural & -0.13 & 0.0146790170369514 \\
## non\_hispanic\_white:area & -0.00 & 0.00943166450959449 \\
## \hline
## \end{tabular}
## \end{table}
Obtain the studentized deleted residuals and identify any outlying Y observations. Use the Bonferroni outlier test procedure at α = 0.05.
final_model = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data_norm)
n=length(data_norm)
p=11
rsd.lm=round(rstudent(final_model), 3)
result = ifelse(rsd.lm > qt(1-0.95/2/n,n-p-1), TRUE, FALSE)
table(result)
## result
## FALSE TRUE
## 2901 43
outliers = names(which(result))
outliers
## [1] "71" "116" "121" "122" "227" "316" "376" "477" "548" "552"
## [11] "748" "761" "790" "903" "969" "1002" "1014" "1076" "1196" "1211"
## [21] "1222" "1226" "1240" "1309" "1538" "1581" "1603" "1648" "1906" "2210"
## [31] "2212" "2230" "2245" "2256" "2315" "2352" "2358" "2414" "2577" "2868"
## [41] "2875" "2880" "2928"
final_model = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data_norm)
par(mfrow = c(2,2))
plot(final_model)
plot(final_model)
#data_norm
#data_norm[-c(2577),]
colnames(data_norm)
## [1] "X.1" "X"
## [3] "fips" "county"
## [5] "state" "cases"
## [7] "deaths" "mask_score"
## [9] "ratio_pop_to_physician" "unemploy_rate"
## [11] "HS_grad_rate" "flu_vacc_."
## [13] "uninsured" "median_income"
## [15] "pop_size" "X._rural"
## [17] "life_expectancy" "non_hispanic_african_american"
## [19] "american_indian" "asian"
## [21] "native_hawaiian" "hispanic"
## [23] "non_hispanic_white" "sim_diversity_index"
## [25] "X._democrat" "area"
## [27] "pop_density" "cases_norm"
for (i in colnames(data_norm)){
print(i)
print(summary(eval(call('$', data_norm, i))))
}
## [1] "X.1"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 761.8 1506.5 1539.2 2330.2 3109.0
## [1] "X"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.0 842.8 1598.5 1633.3 2442.5 3234.0
## [1] "fips"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1001 18182 29100 30355 45319 56045
## [1] "county"
## Length Class Mode
## 2944 character character
## [1] "state"
## Length Class Mode
## 2944 character character
## [1] "cases"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.0 469.5 1101.5 4073.6 2758.5 370729.0
## [1] "deaths"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 6.00 17.00 78.48 48.00 7446.00
## [1] "mask_score"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.433 2.701 3.000 2.995 3.296 3.849
## [1] "ratio_pop_to_physician"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3936 1048 1498 1916 2202 51307
## [1] "unemploy_rate"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01624 0.03567 0.04388 0.04619 0.05365 0.19066
## [1] "HS_grad_rate"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3630 0.8467 0.8951 0.8838 0.9344 1.0000
## [1] "flu_vacc_."
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0700 0.3500 0.4200 0.4087 0.4800 0.6500
## [1] "uninsured"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02068 0.07036 0.10272 0.10937 0.13841 0.30877
## [1] "median_income"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24783 42208 48728 50996 56546 136191
## [1] "pop_size"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1718 13074 28270 107303 73641 10163507
## [1] "X._rural"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3222 0.5739 0.5678 0.8149 1.0000
## [1] "life_expectancy"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 66.50 75.49 77.48 77.40 79.22 97.97
## [1] "non_hispanic_african_american"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0005099 0.0073038 0.0244137 0.0925949 0.1061252 0.8532961
## [1] "american_indian"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000496 0.003656 0.005952 0.019425 0.012304 0.879781
## [1] "asian"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.004604 0.007189 0.015009 0.014113 0.430067
## [1] "native_hawaiian"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000000 0.0003171 0.0005949 0.0012039 0.0011243 0.1312001
## [1] "hispanic"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.005151 0.022798 0.042757 0.094767 0.096909 0.963230
## [1] "non_hispanic_white"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0276 0.6470 0.8371 0.7647 0.9250 0.9792
## [1] "sim_diversity_index"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04102 0.14154 0.28553 0.31371 0.48660 0.82897
## [1] "X._democrat"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.06955 0.21860 0.30358 0.33789 0.42621 0.95695
## [1] "area"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 426.7 599.9 946.6 900.2 20056.9
## [1] "pop_density"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.50 20.98 48.40 214.13 123.85 17179.10
## [1] "cases_norm"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00140 0.02720 0.04001 0.04221 0.05357 0.19353
data_norm["2577",]
## X.1 X fips county state cases deaths mask_score
## 2577 2727 2844 48473 Waller Texas 1113 18 3.38
## ratio_pop_to_physician unemploy_rate HS_grad_rate flu_vacc_. uninsured
## 2577 51307 0.0486915 0.947 0.42 0.2385323
## median_income pop_size X._rural life_expectancy
## 2577 55224 51307 0.6164101 79.05264
## non_hispanic_african_american american_indian asian native_hawaiian
## 2577 0.2453856 0.01403317 0.01075877 0.001052488
## hispanic non_hispanic_white sim_diversity_index X._democrat area
## 2577 0.3012844 0.4287719 0.6648545 0.3530929 513.43
## pop_density cases_norm
## 2577 84.1 0.02169295
final_model = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data_norm)
final_model_no_outlier = lm(sqrt(cases_norm) ~ mask_score+mask_score:unemploy_rate + ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured + unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white + flu_vacc_.:hispanic + median_income:asian + X._rural:asian + X._rural:hispanic + area:non_hispanic_white, data = data_norm[-c(2577),])
a <- final_model$fitted.value
b <- predict(final_model_no_outlier, data_norm)
#a
#b
plot(a, b, xlab="fitted values using all cases", ylab="fitted values without outliers") ## compare fitted values
abline(0,1)
mean(abs(a-b)/a*100)
## [1] 0.7263656
par(mfrow = c(2,2))
plot(final_model)
plot(final_model_no_outlier)
summary(final_model_no_outlier)
##
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + mask_score:unemploy_rate +
## ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured +
## unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white +
## flu_vacc_.:hispanic + median_income:asian + X._rural:asian +
## X._rural:hispanic + area:non_hispanic_white, data = data_norm[-c(2577),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.138711 -0.026040 -0.001925 0.024246 0.249273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.033e-01 7.744e-03 64.986 < 2e-16 ***
## mask_score -8.406e-02 2.584e-03 -32.533 < 2e-16 ***
## mask_score:unemploy_rate 5.126e-01 4.248e-02 12.066 < 2e-16 ***
## ratio_pop_to_physician:uninsured -2.368e-05 3.608e-06 -6.561 6.29e-11 ***
## unemploy_rate:X._democrat -1.210e+00 1.576e-01 -7.681 2.14e-14 ***
## unemploy_rate:non_hispanic_white -2.608e+00 1.179e-01 -22.129 < 2e-16 ***
## flu_vacc_.:hispanic 1.145e-01 2.426e-02 4.719 2.48e-06 ***
## median_income:asian -1.837e-06 3.714e-07 -4.947 7.97e-07 ***
## asian:X._rural -7.141e-01 1.963e-01 -3.637 0.00028 ***
## hispanic:X._rural -1.275e-01 1.580e-02 -8.066 1.05e-15 ***
## non_hispanic_white:area -6.062e-06 8.636e-07 -7.020 2.75e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04118 on 2932 degrees of freedom
## Multiple R-squared: 0.4012, Adjusted R-squared: 0.3992
## F-statistic: 196.5 on 10 and 2932 DF, p-value: < 2.2e-16
boxplot(final_model_no_outlier$coefficients-final_model$coefficients)
summary(final_model)
##
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + mask_score:unemploy_rate +
## ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured +
## unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white +
## flu_vacc_.:hispanic + median_income:asian + X._rural:asian +
## X._rural:hispanic + area:non_hispanic_white, data = data_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.136577 -0.026452 -0.001916 0.023980 0.246871
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.982e-01 7.644e-03 65.168 < 2e-16 ***
## mask_score -8.319e-02 2.580e-03 -32.251 < 2e-16 ***
## mask_score:unemploy_rate 4.865e-01 4.202e-02 11.579 < 2e-16 ***
## ratio_pop_to_physician:uninsured -1.383e-05 2.511e-06 -5.509 3.93e-08 ***
## unemploy_rate:X._democrat -1.118e+00 1.560e-01 -7.164 9.87e-13 ***
## unemploy_rate:non_hispanic_white -2.538e+00 1.167e-01 -21.755 < 2e-16 ***
## flu_vacc_.:hispanic 1.175e-01 2.431e-02 4.835 1.40e-06 ***
## median_income:asian -1.785e-06 3.720e-07 -4.799 1.68e-06 ***
## asian:X._rural -7.132e-01 1.968e-01 -3.625 0.000294 ***
## hispanic:X._rural -1.309e-01 1.581e-02 -8.275 < 2e-16 ***
## non_hispanic_white:area -5.858e-06 8.639e-07 -6.781 1.43e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04127 on 2933 degrees of freedom
## Multiple R-squared: 0.3985, Adjusted R-squared: 0.3964
## F-statistic: 194.3 on 10 and 2933 DF, p-value: < 2.2e-16
df_final_model = as.data.frame(final_model$coefficients)
df_final_model$r2 = c('NA',r2_normal(final_model)[,2][1:10])
df_final_model
## final_model$coefficients r2
## (Intercept) 4.981737e-01 NA
## mask_score -8.319280e-02 0.181703380784454
## mask_score:unemploy_rate 4.865244e-01 0.00281206648342403
## ratio_pop_to_physician:uninsured -1.383041e-05 2.14400529371138e-05
## unemploy_rate:X._democrat -1.117925e+00 0.0431924280537644
## unemploy_rate:non_hispanic_white -2.537833e+00 0.136023534260933
## flu_vacc_.:hispanic 1.175145e-01 6.98851182152494e-08
## median_income:asian -1.785194e-06 0.00311190366257682
## asian:X._rural -7.132081e-01 0.00750359975974371
## hispanic:X._rural -1.308632e-01 0.0146790170369514
## non_hispanic_white:area -5.858479e-06 0.00943166450959449
#print(xtable(as.data.frame(df_final_model), type = "latex"))
summary(final_model_no_outlier)
##
## Call:
## lm(formula = sqrt(cases_norm) ~ mask_score + mask_score:unemploy_rate +
## ratio_pop_to_physician:uninsured + ratio_pop_to_physician:uninsured +
## unemploy_rate:X._democrat + unemploy_rate:non_hispanic_white +
## flu_vacc_.:hispanic + median_income:asian + X._rural:asian +
## X._rural:hispanic + area:non_hispanic_white, data = data_norm[-c(2577),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.138711 -0.026040 -0.001925 0.024246 0.249273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.033e-01 7.744e-03 64.986 < 2e-16 ***
## mask_score -8.406e-02 2.584e-03 -32.533 < 2e-16 ***
## mask_score:unemploy_rate 5.126e-01 4.248e-02 12.066 < 2e-16 ***
## ratio_pop_to_physician:uninsured -2.368e-05 3.608e-06 -6.561 6.29e-11 ***
## unemploy_rate:X._democrat -1.210e+00 1.576e-01 -7.681 2.14e-14 ***
## unemploy_rate:non_hispanic_white -2.608e+00 1.179e-01 -22.129 < 2e-16 ***
## flu_vacc_.:hispanic 1.145e-01 2.426e-02 4.719 2.48e-06 ***
## median_income:asian -1.837e-06 3.714e-07 -4.947 7.97e-07 ***
## asian:X._rural -7.141e-01 1.963e-01 -3.637 0.00028 ***
## hispanic:X._rural -1.275e-01 1.580e-02 -8.066 1.05e-15 ***
## non_hispanic_white:area -6.062e-06 8.636e-07 -7.020 2.75e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04118 on 2932 degrees of freedom
## Multiple R-squared: 0.4012, Adjusted R-squared: 0.3992
## F-statistic: 196.5 on 10 and 2932 DF, p-value: < 2.2e-16
df_final_model = as.data.frame(final_model_no_outlier$coefficients)
df_final_model$r2 = c('NA',r2_normal(final_model_no_outlier)[,2][1:10])
df_final_model
## final_model_no_outlier$coefficients
## (Intercept) 5.032552e-01
## mask_score -8.405843e-02
## mask_score:unemploy_rate 5.126312e-01
## ratio_pop_to_physician:uninsured -2.367590e-05
## unemploy_rate:X._democrat -1.210433e+00
## unemploy_rate:non_hispanic_white -2.608188e+00
## flu_vacc_.:hispanic 1.145099e-01
## median_income:asian -1.837365e-06
## asian:X._rural -7.141138e-01
## hispanic:X._rural -1.274679e-01
## non_hispanic_white:area -6.062256e-06
## r2
## (Intercept) NA
## mask_score 0.18155105239686
## mask_score:unemploy_rate 0.00281273741313736
## ratio_pop_to_physician:uninsured 1.29792732660245e-05
## unemploy_rate:X._democrat 0.0436781444233098
## unemploy_rate:non_hispanic_white 0.138251201500967
## flu_vacc_.:hispanic 1.5217923565925e-06
## median_income:asian 0.00342050329718961
## asian:X._rural 0.00742418461362043
## hispanic:X._rural 0.014008053183516
## non_hispanic_white:area 0.0100632438989106
print(xtable(as.data.frame(df_final_model), type = "latex"))
## % latex table generated in R 4.0.3 by xtable 1.8-4 package
## % Sun Dec 13 17:47:11 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrl}
## \hline
## & final\_model\_no\_outlier\$coefficients & r2 \\
## \hline
## (Intercept) & 0.50 & NA \\
## mask\_score & -0.08 & 0.18155105239686 \\
## mask\_score:unemploy\_rate & 0.51 & 0.00281273741313736 \\
## ratio\_pop\_to\_physician:uninsured & -0.00 & 1.29792732660245e-05 \\
## unemploy\_rate:X.\_democrat & -1.21 & 0.0436781444233098 \\
## unemploy\_rate:non\_hispanic\_white & -2.61 & 0.138251201500967 \\
## flu\_vacc\_.:hispanic & 0.11 & 1.5217923565925e-06 \\
## median\_income:asian & -0.00 & 0.00342050329718961 \\
## asian:X.\_rural & -0.71 & 0.00742418461362043 \\
## hispanic:X.\_rural & -0.13 & 0.014008053183516 \\
## non\_hispanic\_white:area & -0.00 & 0.0100632438989106 \\
## \hline
## \end{tabular}
## \end{table}